In this document, we build a two part mixture in brms which captures the proportion of responses that follow a linear log odds (LLO) pattern vs a constant response pattern. The LLO model follows from related work suggesting that the human perception of probability is encoded on a log odds scale. On this scale, the slope of a linear model represents the shape and severity of the function describing bias in probability perception. The greater the deviation from a slope of 1 (i.e., ideal performance), the more biased the judgments of probability. Slopes less than one correspond to the kind of bias predicted by excessive attention to the mean. On the same log odds scale, the intercept is a crossover-point which should be proportional to the number of categories of possible outcomes among which probability is divided. In our case, the intercept should be about 0.5 since workers are judging the probability of team A vs team B winning a round of a simulated game.

Although we start by building up the LLO model on its own, there are a large number of responses near 50% (the middle of the probability scale) which cannot be accounted for by the LLO model. Thus, we add a mixture component for a random constant response per subject and model the proportion of responses which match this pattern depending on the visualization condition.

Prepare Data

We load worker responses from our pilot

# read worker responses from the pilot experiment
responses_df <- read_csv(file = "pilot-anonymous.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   workerId = col_character(),
##   condition = col_character(),
##   batch = col_integer(),
##   counterbalance = col_integer(),
##   numeracy = col_integer(),
##   bet = col_integer(),
##   outcome = col_character(),
##   sdDiff = col_integer(),
##   trial = col_integer(),
##   trialIdx = col_integer(),
##   repaired = col_character()
## )
## See spec(...) for full column specifications.
# convert column names from camelCase to snake_case, then nudge boundary
# responses away from the ends of their scales (the logit of 0 or 1 is
# infinite, so exact-boundary responses would break the log odds transform)
responses_df <- responses_df %>%
  rename(
    worker_id = workerId,
    ground_truth = groundTruth,
    sd_diff = sdDiff,
    start_time = startTime,
    resp_time = respTime,
    trial_dur = trialDur
  ) %>%
  # one trial duration was negative due to faulty reconstruction; clamp at zero
  mutate(trial_dur = ifelse(trial_dur < 0, 0, trial_dur)) %>%
  # keep probability (0-100) and bet (0-1000) responses strictly interior
  mutate(
    cles = ifelse(cles == 0, 0.25, cles),
    cles = ifelse(cles == 100, 99.75, cles),
    bet = ifelse(bet == 1000, 999.75, bet)
  )

# inspect the first few rows
head(responses_df)
## # A tibble: 6 x 22
##   worker_id condition batch counterbalance totalBonus duration numeracy
##   <chr>     <chr>     <int>          <int>      <dbl>    <dbl>    <int>
## 1 7553db84  HOPs          5              5      0.858     687.        6
## 2 7553db84  HOPs          5              5      0.858     687.        6
## 3 7553db84  HOPs          5              5      0.858     687.        6
## 4 7553db84  HOPs          5              5      0.858     687.        6
## 5 7553db84  HOPs          5              5      0.858     687.        6
## 6 7553db84  HOPs          5              5      0.858     687.        6
## # ... with 15 more variables: bet <dbl>, bonus <dbl>, cles <dbl>,
## #   ground_truth <dbl>, keep <dbl>, outcome <chr>, pay <dbl>,
## #   resp_time <dbl>, sd_diff <int>, start_time <dbl>, trial <int>,
## #   trial_dur <dbl>, trialIdx <int>, win <dbl>, repaired <chr>

We also load the data that was used to generate the stimuli that users saw in the pilot.

# load conds_df, the data frame used to generate the stimuli workers saw
load(file = "./conds_df.Rda")

Now, we’ll also process this data to prepare it for modeling.

# calculate the per-draw difference (team B minus team A) for the heuristic
# functions; result is one list-column row per condition
# NOTE(review): bare unnest() with no cols argument is deprecated in
# tidyr >= 1.0 -- works here but will warn on newer versions
draw_differences <- conds_df %>% select(condition, Team, draws) %>% 
  spread(Team, draws) %>% 
  unnest() %>% 
  mutate(
    draws_diff=B - A, 
    A=NULL, 
    B=NULL
  ) %>% 
  group_by(condition) %>% 
  # keep only the first 50 differences per condition
  # (presumably the number of draws shown per stimulus -- TODO confirm)
  summarise(draws_diff = list(draws_diff[1:50]))

# reformat the stimuli-generating conditions data frame
stimuli_data_df <- conds_df %>% 
  filter(Team %in% "A") %>% # drop duplicate rows for two teams
  left_join(draw_differences, by='condition') %>%
  mutate( # drop unnecessary columns
    condition=NULL,
    Team=NULL, 
    draws=NULL,
    draw_n=NULL,
    quantiles=NULL,
    sample_n=NULL
  )

# repeat the stimuli data frame once per worker so it can be merged with the
# per-worker response data (row block recycled with rep())
stimuli_data_df <- stimuli_data_df[rep(seq_len(nrow(stimuli_data_df)), times=length(unique(responses_df$worker_id))),]
# assign worker ids: each worker gets one row per ground_truth x sd_diff cell;
# sort() keeps the ids grouped in blocks aligned with the recycled rows
stimuli_data_df$worker_id <- sort(rep(unique(responses_df$worker_id), each=(length(unique(responses_df$ground_truth))) * length(unique(responses_df$sd_diff))))

# calculate the baseline of the relative mean difference heuristic
stimuli_data_df$max_abs_mean_diff <- max(abs(stimuli_data_df$mean_diff))

We need the data in a format where it is prepared for modeling. We need to get the stimuli-generating data and the worker response data in a single dataframe with one row per worker * trial and convert response and ground truth units to log odds.

# create data frame for model by merging stimuli-generating data with responses
model_df_llo <- stimuli_data_df %>%
  mutate( # create rounded version of ground_truth to merge on, leaving unrounded value stored in odds_of_victory
    ground_truth=round(odds_of_victory, 3)
  ) %>%
  # full join keeps rows from both sides: any stimulus cell without a matching
  # response (or vice versa) surfaces as NAs instead of being dropped silently
  full_join(responses_df, by=c("worker_id", "sd_diff", "ground_truth")) %>%
  rename( # rename ground_truth columns, so it is clear which is rounded and which should be used in the model
    ground_truth_rounded=ground_truth,
    ground_truth=odds_of_victory
  ) %>% 
  mutate( # apply logit function to cles judgments and ground truth
    lo_cles=qlogis(cles / 100),           # responses are on a 0-100 scale
    lo_ground_truth=qlogis(ground_truth), # ground truth is already a proportion
    sd_diff=as.factor(sd_diff)            # treat uncertainty level as categorical
  )

Distribution of CLES

We start as simple as possible by just modeling the distribution of CLES on the log odds scale.

Let’s check that our priors seem reasonable.

# prior predictive check for the intercept-only model.
# brms bounds sigma below at zero, so the matching prior predictive draws
# sigma from a half-normal (abs of a normal) rather than an unbounded normal;
# this also avoids the NAs rnorm() produces when handed a negative sd
n <- 1e4
tibble(sample_mu    = rnorm(n, mean = 0, sd = 1),
       sample_sigma = abs(rnorm(n, mean = 0, sd = 1))) %>% 
  mutate(lo_cles = rnorm(n, mean = sample_mu, sd = sample_sigma),
         cles = plogis(lo_cles)) %>% # back-transform to the probability scale
  ggplot(aes(x = cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = expression(paste("Prior predictive distribution for ", italic(h[i]))),
       x = NULL) + # original `lo_cles = NULL` was a no-op: labs() takes aesthetic names
  theme(panel.grid = element_blank())
## Warning in rnorm(n, mean = sample_mu, sd = sample_sigma): NAs produced
## Warning: Removed 5018 rows containing non-finite values (stat_density).

Fit an intercept model.

# starting as simple as possible: learn the distribution of lo_cles with an
# intercept-only gaussian model; weakly informative normal(0, 1) priors on
# the intercept and sigma (brms truncates the sigma prior at zero)
m.lo_cles <- brm(data = model_df_llo, family = gaussian,
              lo_cles ~ 1,
              prior = c(prior(normal(0, 1), class = Intercept),
                        prior(normal(0, 1), class = sigma)),
              iter = 3000, warmup = 500, chains = 2, cores = 2)
## Compiling the C++ model
## Start sampling

Check diagnostics:

# trace plots: check mixing and stationarity of the chains
plot(m.lo_cles)

# pairs plot: check for pathological posterior correlations between parameters
pairs(m.lo_cles)

# model summary: check Rhat ~ 1.00 and adequate effective sample sizes
print(m.lo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ 1 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept     0.03      0.05    -0.08     0.13       4567 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     1.50      0.04     1.43     1.58       4225 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Let’s get a sense of the distribution of posterior draws.

# get posterior draws
post <- posterior_samples(m.lo_cles)

# get summary stats (posterior mean and 95% quantile interval) on the
# distribution of each parameter
post %>% 
  select(-lp__) %>% # drop log probability of posterior draws
  gather(parameter) %>% # long format: one row per draw x parameter
  group_by(parameter) %>%
  mean_qi(value) # tidybayes: mean with 95% quantile interval
## # A tibble: 2 x 7
##   parameter    value  .lower .upper .width .point .interval
##   <chr>        <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 b_Intercept 0.0262 -0.0765  0.133   0.95 mean   qi       
## 2 sigma       1.50    1.43    1.58    0.95 mean   qi

Linear Log Odds Model of CLES

Now we'll add in a slope parameter. Hopefully, this will reduce the residual variance.

Let’s check that our priors seem reasonable.

# prior predictive check for the linear log odds model.
# sigma is drawn from a half-normal to match brms's lower bound of zero on
# sigma (avoiding NAs from rnorm() with negative sd), and rnorm(n()) sizes
# the simulated responses by the actual row count after unnest() instead of
# recomputing n * length(...) by hand
n <- 1e4
tibble(intercept    = rnorm(n, mean = 0, sd = 1),
       slope        = rnorm(n, mean = 0, sd = 1),
       sample_sigma = abs(rnorm(n, mean = 0, sd = 1)),
       lo_ground_truth = list(qlogis(unique(stimuli_data_df$odds_of_victory)))) %>% 
  unnest() %>%
  mutate(lo_cles = rnorm(n(), mean = intercept + slope * lo_ground_truth, sd = sample_sigma),
         cles = plogis(lo_cles)) %>% # back-transform to the probability scale
  ggplot(aes(x = cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = expression(paste("Prior predictive distribution for ", italic(h[i]))),
       x = NULL) + # original `lo_cles = NULL` was a no-op: labs() takes aesthetic names
  theme(panel.grid = element_blank())
## Warning in rnorm(n * length(unique(stimuli_data_df$odds_of_victory)), mean
## = intercept + : NAs produced
## Warning: Removed 50520 rows containing non-finite values (stat_density).

What we get using generic weakly informative priors seems reasonable given that we are sampling more at extreme levels of ground truth than values in the middle of the scale.

Now, let’s fit the LLO model.

# linear log odds model: lo_cles ~ 1 + lo_ground_truth
# on the log odds scale the slope captures bias in probability perception
# (slope = 1 is unbiased; slope < 1 is bias toward the middle of the scale)
m.llo_cles <- brm(data = model_df_llo, family = gaussian,
              lo_cles ~ 1 + lo_ground_truth,
              prior = c(prior(normal(0, 1), class = Intercept),
                        prior(normal(0, 1), class = b),
                        prior(normal(0, 1), class = sigma)),
              iter = 3000, warmup = 500, chains = 2, cores = 2)
## Compiling the C++ model
## Start sampling

Check diagnostics:

# trace plots: check mixing and stationarity of the chains
plot(m.llo_cles)

# pairs plot: check for pathological posterior correlations between parameters
pairs(m.llo_cles)

# model summary: check Rhat ~ 1.00 and adequate effective sample sizes
print(m.llo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ 1 + lo_ground_truth 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##                 Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept           0.03      0.05    -0.07     0.12       6508 1.00
## lo_ground_truth     0.30      0.02     0.26     0.34       4745 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     1.32      0.03     1.26     1.39       4411 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Let’s check out a posterior predictive distribution for CLES.

# posterior predictive check: simulate responses from the fitted model and
# inspect their distribution on the probability scale
model_df_llo %>%
  select(lo_ground_truth) %>%
  add_predicted_draws(m.llo_cles, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>% # back-transform to probability
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       post_cles = NULL) +
  theme(panel.grid = element_blank())

The posterior predictive distribution seems too wide. How do the posterior predictions compare to the observed data?

# observed response distribution, for comparison against the posterior
# predictive distribution above
model_df_llo %>%
  ggplot(aes(x = cles)) +
  geom_density(size = 0, fill = "black") +
  scale_y_continuous(NULL, breaks = NULL) +
  theme(panel.grid = element_blank()) +
  labs(subtitle = "Data distribution for CLES",
       cles = NULL)

This seems to suggest that the data generating process is a 50% inflated mixture.

Let’s take a look at some of the estimated linear models.

# get posterior samples
post  <- posterior_samples(m.llo_cles)

# plot 20 posterior draws of the fitted line against the observed data.
# index draws by parameter name rather than column position (post[1:20, 1])
# so the plot does not silently break if posterior_samples() column order
# ever changes
model_df_llo %>%
ggplot(aes(x = lo_ground_truth, y = lo_cles)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  geom_abline(intercept = post$b_Intercept[1:20],
              slope     = post$b_lo_ground_truth[1:20],
              size = 1/3, alpha = .3) +
  geom_point(shape = 1, size = 2, color = "royalblue") +
  # clip both axes to the observed data range
  coord_cartesian(xlim = quantile(model_df_llo$lo_ground_truth, c(0, 1)),
                  ylim = quantile(model_df_llo$lo_cles, c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank())

Add Different Linear Models per Visualization Condition

In the LLO framework, what we really want to know about is the impact of visualization condition on the slopes of linear models in log odds space. Do some visualizations lead to more extreme patterns of bias than others? To test this, we’ll add an interaction between visualization condition and the ground truth.

# update the llo model of cles responses to include an interaction between
# the ground truth and visualization condition. only the slope (not the
# intercept) varies by condition -- presumably because the crossover point
# should not depend on visualization (NOTE(review): confirm this choice)
m.vis.llo_cles <- update(m.llo_cles, 
                         formula = lo_cles ~ 1 + lo_ground_truth + lo_ground_truth:condition,
                         newdata = model_df_llo)
## Start sampling

Check diagnostics:

# trace plots: check mixing and stationarity of the chains
plot(m.vis.llo_cles)

# pairs plot: check for pathological posterior correlations between parameters
pairs(m.vis.llo_cles)

# model summary: check Rhat ~ 1.00 and adequate effective sample sizes
print(m.vis.llo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ lo_ground_truth + lo_ground_truth:condition 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##                                            Estimate Est.Error l-95% CI
## Intercept                                      0.03      0.04    -0.06
## lo_ground_truth                                0.57      0.03     0.51
## lo_ground_truth:conditionintervals_w_means    -0.39      0.04    -0.47
## lo_ground_truth:conditionmeans_only           -0.44      0.05    -0.54
##                                            u-95% CI Eff.Sample Rhat
## Intercept                                      0.12       4986 1.00
## lo_ground_truth                                0.63       2643 1.00
## lo_ground_truth:conditionintervals_w_means    -0.30       2915 1.00
## lo_ground_truth:conditionmeans_only           -0.35       3217 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     1.24      0.03     1.18     1.30       5383 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Let’s check out a posterior predictive distribution for CLES. This should look more like a mixture with different slopes, but it still looks really wide.

# posterior predictive check: simulate responses conditional on ground truth
# and visualization condition, then inspect the probability-scale density
model_df_llo %>%
  select(lo_ground_truth,condition) %>%
  add_predicted_draws(m.vis.llo_cles, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>% # back-transform to probability
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       post_cles = NULL) +
  theme(panel.grid = element_blank())

What do the posteriors for the effect of each visualization condition look like?

# use posterior samples to define distributions for the slope in each
# visualization condition; HOPs is the reference level here, so the other
# two condition slopes are the reference slope plus their interaction offset
posterior_samples(m.vis.llo_cles) %>%
  transmute(slope_HOPs = b_lo_ground_truth,
            slope_intervals_w_means = b_lo_ground_truth + `b_lo_ground_truth:conditionintervals_w_means`,
            slope_means_only = b_lo_ground_truth + `b_lo_ground_truth:conditionmeans_only`) %>%
  gather(key, value) %>% # long format: one row per draw x condition
  ggplot(aes(x = value, group = key, color = key, fill = key)) +
  geom_density(alpha = 0.3) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) +
  scale_x_continuous(expression(slope), expand = c(0, 0)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior for slopes by visualization condition") +
  theme(panel.grid = element_blank())

This suggests that users are more biased toward responses of 50% when the mean is visually salient.

Let’s take a look at some of the estimated linear models per visualization condition.

# set up new dataframe for prediction: a 30-point grid spanning the observed
# range of lo_ground_truth, repeated once for each of the 3 conditions
nd <- tibble(lo_ground_truth = seq(from = quantile(model_df_llo$lo_ground_truth, 0), to = quantile(model_df_llo$lo_ground_truth, 1), length.out = 30) %>% 
         rep(., times = 3),
         condition = rep(unique(model_df_llo$condition), each = 30))

# pass our new predictive dataframe through our fitted model to get the
# expected response (Estimate) and 95% interval (Q2.5/Q97.5) per grid point
fitd_m.vis.llo_cles <- fitted(m.vis.llo_cles, newdata = nd) %>%
  as_tibble() %>%
  bind_cols(nd)
# plot estimated linear models against observed data
model_df_llo %>%
  ggplot(aes(x = lo_ground_truth)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  # 95% credible ribbon for the expected response in each condition
  geom_ribbon(data = fitd_m.vis.llo_cles,
              aes(ymin  = Q2.5, 
                  ymax  = Q97.5,
                  fill  = condition,
                  group = condition),
              alpha = .25) +
  # posterior mean line per condition
  geom_line(data = fitd_m.vis.llo_cles,
              aes(y     = Estimate, 
                  color = condition,
                  group = condition)) +
  geom_point(aes(y = lo_cles, color = condition)) +
    scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) +
  # clip axes to the observed data range
  coord_cartesian(xlim = quantile(model_df_llo$lo_ground_truth, c(0, 1)),
                  ylim = quantile(model_df_llo$lo_cles, c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  facet_grid(. ~ condition)

This looks pretty promising, but let’s see if we can parse some of the variability into error vs individual variability in slopes.

Add Hierarchy for Slopes

In the LLO framework, what we really want to know about is the impact of visualization condition on the slopes of linear models in log odds space. Do some visualizations lead to more extreme patterns of bias than others? To test this, we’ll add an interaction between visualization condition and the ground truth.

# add hierarchy: per-worker random intercepts and slopes, plus per-condition
# population-level slopes. with no population main effect of lo_ground_truth,
# the interaction is coded full rank, so each condition gets its own slope
# coefficient (no reference level -- see the printed summary below)
m.vis.wrkr.llo_cles <- update(m.llo_cles,
                         formula = lo_cles ~ 1 + (lo_ground_truth|worker_id) + lo_ground_truth:condition,
                         newdata = model_df_llo)
## The desired updates require recompiling the model
## Compiling the C++ model
## Start sampling

Check diagnostics:

# trace plots: check mixing and stationarity of the chains
plot(m.vis.wrkr.llo_cles)

# pairs plot: check for pathological posterior correlations between parameters
pairs(m.vis.wrkr.llo_cles)

# model summary: check Rhat ~ 1.00 and adequate effective sample sizes
print(m.vis.wrkr.llo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ (lo_ground_truth | worker_id) + lo_ground_truth:condition 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Group-Level Effects: 
## ~worker_id (Number of levels: 39) 
##                                Estimate Est.Error l-95% CI u-95% CI
## sd(Intercept)                      0.55      0.08     0.42     0.71
## sd(lo_ground_truth)                0.31      0.04     0.24     0.40
## cor(Intercept,lo_ground_truth)    -0.00      0.18    -0.35     0.36
##                                Eff.Sample Rhat
## sd(Intercept)                        1588 1.00
## sd(lo_ground_truth)                  1513 1.00
## cor(Intercept,lo_ground_truth)       1314 1.00
## 
## Population-Level Effects: 
##                                            Estimate Est.Error l-95% CI
## Intercept                                      0.03      0.09    -0.15
## lo_ground_truth:conditionHOPs                  0.56      0.09     0.38
## lo_ground_truth:conditionintervals_w_means     0.18      0.09     0.00
## lo_ground_truth:conditionmeans_only            0.12      0.09    -0.05
##                                            u-95% CI Eff.Sample Rhat
## Intercept                                      0.21       1229 1.00
## lo_ground_truth:conditionHOPs                  0.74       1547 1.00
## lo_ground_truth:conditionintervals_w_means     0.35       1601 1.00
## lo_ground_truth:conditionmeans_only            0.31       1545 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     0.91      0.02     0.87     0.96       5018 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Let’s check out a posterior predictive distribution for CLES. This distribution is looking narrower than in previous iterations of the model, which suggests that individual variation in slopes accounts for some (but not all) of the 50% responses.

# posterior predictive check: simulate responses conditional on ground truth,
# condition, and worker, then inspect the probability-scale density
model_df_llo %>%
  select(lo_ground_truth,condition,worker_id) %>%
  add_predicted_draws(m.vis.wrkr.llo_cles, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>% # back-transform to probability
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       post_cles = NULL) +
  theme(panel.grid = element_blank())

What do the posteriors for the effect of each visualization condition look like?

# use posterior samples to define distributions for the slope in each
# visualization condition; the condition interaction is full rank in this
# model, so each coefficient is directly that condition's slope
posterior_samples(m.vis.wrkr.llo_cles) %>%
  transmute(slope_HOPs = `b_lo_ground_truth:conditionHOPs`,
            slope_intervals_w_means = `b_lo_ground_truth:conditionintervals_w_means`,
            slope_means_only = `b_lo_ground_truth:conditionmeans_only`) %>%
  gather(key, value) %>% # long format: one row per draw x condition
  ggplot(aes(x = value, group = key, color = key, fill = key)) +
  geom_density(alpha = 0.3) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) +
  scale_x_continuous(expression(slope), expand = c(0, 0)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior for slopes by visualization condition") +
  theme(panel.grid = element_blank())

The effect we saw earlier is still present.

Let’s take a look at some of the estimated linear models per visualization condition.

# this time we'll adopt functions from the tidybayes package to make plotting posterior predictions easier
model_df_llo %>%
  group_by(condition,worker_id) %>%
  # 101-point grid over the observed range of lo_ground_truth per group
  data_grid(lo_ground_truth = seq_range(lo_ground_truth, n = 101)) %>%
  add_predicted_draws(m.vis.wrkr.llo_cles) %>%
  ggplot(aes(x = lo_ground_truth, y = lo_cles, color = condition, fill = condition)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  # nested 50/80/95% posterior predictive bands
  stat_lineribbon(aes(y = .prediction), .width = c(.95, .80, .50), alpha = 1/4) +
  geom_point(data = model_df_llo) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) + 
  # clip axes to the observed data range
  coord_cartesian(xlim = quantile(model_df_llo$lo_ground_truth, c(0, 1)),
                  ylim = quantile(model_df_llo$lo_cles, c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) + 
  facet_grid(. ~ condition)

What does this look like in probability units?

# same lineribbon plot, but with both axes back-transformed to the
# probability scale via plogis()
model_df_llo %>%
  group_by(condition,worker_id) %>%
  data_grid(lo_ground_truth = seq_range(lo_ground_truth, n = 101)) %>%
  add_predicted_draws(m.vis.wrkr.llo_cles) %>%
  ggplot(aes(x = plogis(lo_ground_truth), y = plogis(lo_cles), color = condition, fill = condition)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  # nested 50/80/95% posterior predictive bands, transformed per draw
  stat_lineribbon(aes(y = plogis(.prediction)), .width = c(.95, .80, .50), alpha = 1/4) +
  geom_point(data = model_df_llo) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) + 
  # clip axes to the observed data range (on the probability scale)
  coord_cartesian(xlim = quantile(plogis(model_df_llo$lo_ground_truth), c(0, 1)),
                  ylim = quantile(plogis(model_df_llo$lo_cles), c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) + 
  facet_grid(. ~ condition)

This is very promising, but there are areas of high posterior density in the intervals_w_means and means_only conditions where there are no observations. Maybe adding the level of uncertainty shown in each plot to the model will improve the fit?

Add an Interaction Between the Level of Uncertainty in the Data and Slopes

# update the llo model so slopes depend on the full condition x sd_diff
# interaction (six population-level slope cells -- see the summary below),
# keeping the per-worker random intercepts and slopes
m.vis.sd.wrkr.llo_cles <- update(m.llo_cles, 
                         formula = lo_cles ~ 1 + (lo_ground_truth|worker_id) + lo_ground_truth:condition:sd_diff,
                         newdata = model_df_llo)
## The desired updates require recompiling the model
## Compiling the C++ model
## recompiling to avoid crashing R session
## Start sampling

Check diagnostics:

# trace plots: check mixing and stationarity of the chains
plot(m.vis.sd.wrkr.llo_cles)

# pairs plot: check for pathological posterior correlations between parameters
pairs(m.vis.sd.wrkr.llo_cles)

# model summary: check Rhat ~ 1.00 and adequate effective sample sizes
print(m.vis.sd.wrkr.llo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ (lo_ground_truth | worker_id) + lo_ground_truth:condition:sd_diff 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Group-Level Effects: 
## ~worker_id (Number of levels: 39) 
##                                Estimate Est.Error l-95% CI u-95% CI
## sd(Intercept)                      0.55      0.08     0.42     0.72
## sd(lo_ground_truth)                0.31      0.04     0.24     0.40
## cor(Intercept,lo_ground_truth)    -0.01      0.18    -0.36     0.35
##                                Eff.Sample Rhat
## sd(Intercept)                        1610 1.00
## sd(lo_ground_truth)                  1663 1.00
## cor(Intercept,lo_ground_truth)       1412 1.00
## 
## Population-Level Effects: 
##                                                     Estimate Est.Error
## Intercept                                               0.03      0.09
## lo_ground_truth:conditionHOPs:sd_diff1                  0.50      0.09
## lo_ground_truth:conditionintervals_w_means:sd_diff1     0.09      0.09
## lo_ground_truth:conditionmeans_only:sd_diff1           -0.02      0.09
## lo_ground_truth:conditionHOPs:sd_diff5                  0.62      0.09
## lo_ground_truth:conditionintervals_w_means:sd_diff5     0.27      0.09
## lo_ground_truth:conditionmeans_only:sd_diff5            0.26      0.09
##                                                     l-95% CI u-95% CI
## Intercept                                              -0.16     0.21
## lo_ground_truth:conditionHOPs:sd_diff1                  0.33     0.67
## lo_ground_truth:conditionintervals_w_means:sd_diff1    -0.09     0.27
## lo_ground_truth:conditionmeans_only:sd_diff1           -0.21     0.15
## lo_ground_truth:conditionHOPs:sd_diff5                  0.45     0.79
## lo_ground_truth:conditionintervals_w_means:sd_diff5     0.09     0.44
## lo_ground_truth:conditionmeans_only:sd_diff5            0.07     0.44
##                                                     Eff.Sample Rhat
## Intercept                                                 1010 1.00
## lo_ground_truth:conditionHOPs:sd_diff1                    1602 1.00
## lo_ground_truth:conditionintervals_w_means:sd_diff1       1194 1.00
## lo_ground_truth:conditionmeans_only:sd_diff1              1169 1.00
## lo_ground_truth:conditionHOPs:sd_diff5                    1631 1.00
## lo_ground_truth:conditionintervals_w_means:sd_diff5       1238 1.00
## lo_ground_truth:conditionmeans_only:sd_diff5              1147 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     0.88      0.02     0.83     0.93       6817 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Let’s check out a posterior predictive distribution for CLES. This still looks too wide considering the number of 50% responses we see in the data.

# posterior predictive check: simulate responses conditional on ground truth,
# condition, uncertainty level, and worker; inspect probability-scale density
model_df_llo %>%
  select(lo_ground_truth,condition,sd_diff,worker_id) %>%
  add_predicted_draws(m.vis.sd.wrkr.llo_cles, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>% # back-transform to probability
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       post_cles = NULL) +
  theme(panel.grid = element_blank())

What do the posteriors for the effect of each visualization condition look like?

# use posterior samples to define distributions for the slope in each
# visualization condition, marginalizing over the two sd_diff levels.
# FIX(review): the original SUMMED the two sd_diff cell slopes, which doubles
# the marginal slope (e.g. HOPs: 0.50 + 0.62 = 1.12, vs ~0.56 for HOPs in the
# model without sd_diff); with balanced cells the marginal slope is their mean
posterior_samples(m.vis.sd.wrkr.llo_cles) %>%
  transmute(slope_HOPs = (`b_lo_ground_truth:conditionHOPs:sd_diff1` + `b_lo_ground_truth:conditionHOPs:sd_diff5`) / 2,
            slope_intervals_w_means = (`b_lo_ground_truth:conditionintervals_w_means:sd_diff1` + `b_lo_ground_truth:conditionintervals_w_means:sd_diff5`) / 2,
            slope_means_only = (`b_lo_ground_truth:conditionmeans_only:sd_diff1` + `b_lo_ground_truth:conditionmeans_only:sd_diff5`) / 2) %>%
  gather(key, value) %>% # long format: one row per draw x condition
  ggplot(aes(x = value, group = key, color = key, fill = key)) +
  geom_density(alpha = 0.3) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) +
  scale_x_continuous(expression(slope), expand = c(0, 0)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior for slopes by visualization condition") +
  theme(panel.grid = element_blank())

Let’s take a look at some of the estimated linear models per visualization condition.

# this time we'll adopt functions from the tidybayes package to make plotting posterior predictions easier
model_df_llo %>%
  group_by(condition,sd_diff,worker_id) %>%
  # 101-point grid over the observed range of lo_ground_truth per group
  data_grid(lo_ground_truth = seq_range(lo_ground_truth, n = 101)) %>%
  add_predicted_draws(m.vis.sd.wrkr.llo_cles) %>%
  ggplot(aes(x = lo_ground_truth, y = lo_cles, color = condition, fill = condition)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  # nested 50/80/95% posterior predictive bands
  stat_lineribbon(aes(y = .prediction), .width = c(.95, .80, .50), alpha = 1/4) +
  geom_point(data = model_df_llo) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) + 
  # clip axes to the observed data range
  coord_cartesian(xlim = quantile(model_df_llo$lo_ground_truth, c(0, 1)),
                  ylim = quantile(model_df_llo$lo_cles, c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) + 
  facet_grid(. ~ condition)

What does this look like in probability units?

# same posterior predictive plot as above, but transformed from log odds
# to probability units with plogis()
model_df_llo %>%
  group_by(condition,sd_diff,worker_id) %>%
  data_grid(lo_ground_truth = seq_range(lo_ground_truth, n = 101)) %>%
  add_predicted_draws(m.vis.sd.wrkr.llo_cles) %>%
  ggplot(aes(x = plogis(lo_ground_truth), y = plogis(lo_cles), color = condition, fill = condition)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  stat_lineribbon(aes(y = plogis(.prediction)), .width = c(.95, .80, .50), alpha = 1/4) +
  geom_point(data = model_df_llo) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) + 
  coord_cartesian(xlim = quantile(plogis(model_df_llo$lo_ground_truth), c(0, 1)),
                  ylim = quantile(plogis(model_df_llo$lo_cles), c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) + 
  # note: unlike the log odds version above, this one also facets by sd_diff
  facet_grid(sd_diff ~ condition)

Adding an interaction for sd_diff doesn’t really fix the problem that our posterior predictions have high density where there are few responses. However, it does clarify that the differences between visualization conditions are more pronounced when sd_diff is low.

Let’s check whether adding parameters to account for sd_diff is worth it insofar as the added parameters contribute more to predictive validity than they contribute to overfitting. We’ll determine this by comparing the models with and without parameters for sd_diff according to the widely applicable information criterion (WAIC).

waic(m.vis.wrkr.llo_cles, m.vis.sd.wrkr.llo_cles)
##                                                 WAIC    SE
## m.vis.wrkr.llo_cles                          2156.51 84.86
## m.vis.sd.wrkr.llo_cles                       2105.15 85.46
## m.vis.wrkr.llo_cles - m.vis.sd.wrkr.llo_cles   51.37 14.34

This suggests that it is better to include the parameters for sd_diff in this model.

A Mixture Model of Fake CLES Responses

In order to help our model accommodate the fact that some workers choose the same response regardless of the ground truth, we will need a mixture between two submodels: a LLO process vs a constant response process. To get this working, we’ll start by building a fake data set with a known data-generating process.

Fake Data

This simulated data is a 20/80 mixture of the ground truth vs a constant response of 99.75%, both with standard normal noise added. Let’s prep the data for the model.

# parameters to recover
p_heuristic <- c(0.2, 0.8) # population probabilities of each heuristic, in the order (ground_truth, guess_one)
Sigma <- matrix(c(1, .1, .1, 1), 2, 2) # covariance matrix (only referenced by the commented-out hierarchical version below)
sigma_err <- 1 # residual error in log odds units

# number of trials per participant (should be a multiple of 20 data conditions)
n_trials_per_worker <- 200
n_trial_scalar <- n_trials_per_worker / 20 # replication factor: how many times each existing row is duplicated

# softmax function: maps a numeric vector to probabilities summing to 1.
# Subtracting max(x) before exponentiating is the standard numerically
# stable form: it avoids exp() overflow to Inf for large inputs and does
# not change the result (the shift cancels in the ratio).
softmax <- function(x) {
  shifted <- exp(x - max(x))
  shifted / sum(shifted)
}

# heuristic predictions on each trial: ground_truth vs guess_one
# NOTE(review): stimuli_data_df is defined earlier in the file; this assumes it
# has columns worker_id, sd_diff, odds_of_victory -- verify against prior chunks
heuristics_df_hops <- stimuli_data_df %>% rowwise() %>% 
  mutate( # heuristic functions
    ground_truth = odds_of_victory * 100, # respond with the true probability, in percent
    guess_one = 99.75 # always respond with a near-certain constant
  ) %>% 
  gather(heuristic, heuristic_est, ground_truth, guess_one) %>% # reshape: one row per (trial, heuristic)
  rename(ground_truth = odds_of_victory) %>% # gather() consumed the percent-scale column; restore ground_truth on the probability scale
  select(worker_id, sd_diff, ground_truth, heuristic, heuristic_est) %>%
  mutate(sd_diff = as.factor(sd_diff)) # factor coding, presumably to match model_df_llo for the join later

# calculate p_heuristic per worker (add hierarchy)
# One row per worker, each carrying the population mixture probabilities as a
# list-column. The commented-out code below is the hierarchical alternative:
# per-worker probabilities drawn from a multivariate normal on the log odds scale.
fake_worker_params_df <- model_df_llo %>%
  distinct(worker_id) %>% # distinct() alone suffices; the former select() before it was redundant
  rowwise() %>%
  mutate(
    p_heuristic_worker = list(p_heuristic)) # no hierarchy for p_heuristic values
  # mutate(mu_lo_heuristic_worker = list(MASS::mvrnorm(n(), qlogis(p_heuristic), Sigma))) %>% # intermediate calculation: draws from multivariate normal
  # mutate(
  #   p_heuristic_worker = list(softmax(mu_lo_heuristic_worker)), # what we use to simulate observations for each worker
  #   mu_lo_heuristic_worker = NULL
  # )

# fake data: simulated responses from the known mixture of heuristics
fake_df_llo_mix <- model_df_llo %>%
  left_join(fake_worker_params_df, by="worker_id") %>% # attach each worker's mixture probabilities
  slice(rep(1:n(), each=n_trial_scalar)) %>% # replicate each observed trial n_trial_scalar times to reach n_trials_per_worker
  rowwise() %>% # one-row groups, so sample() below draws one heuristic per trial (n() == 1)
  mutate(heuristic = sample(x=c("ground_truth", "guess_one"), size = n(), replace=TRUE, prob=p_heuristic_worker)) %>% # sample heuristic to use on each trial
  left_join(heuristics_df_hops, by=c("worker_id", "sd_diff", "ground_truth", "heuristic")) %>% # add heuristic estimates to model_df
  select(-cles) %>% # remove actual responses 
  mutate(
    # NOTE(review): rnorm(n(), ...) relies on n() being per-group row count;
    # assumes rowwise grouping survives the join above -- verify
    lo_cles = qlogis(heuristic_est / 100) + rnorm(n(), 0, sigma_err), # likelihood function
    cles = plogis(lo_cles) * 100, # simulated responses from known data generating process
    lo_ground_truth=qlogis(ground_truth)
  )

We fit a model that matches the data-generating process, and importantly, we estimate the log odds of the constant response strategy theta2.

# get_prior(
#   bf(lo_cles ~ 1, mu1 ~ (lo_ground_truth|worker_id), mu2 ~ (1|worker_id), theta2 ~ 1),
#     data = fake_df_llo_mix,
#     family = mixture(gaussian, gaussian)
# )

# fit the model:
#   mu1    = LLO component (linear in log odds ground truth)
#   mu2    = constant response component (intercept only)
#   theta2 = mixing weight for the constant component, on the log odds scale
m.fake.llo_mix <- brm(
  bf(lo_cles ~ 1, mu1 ~ lo_ground_truth, mu2 ~ 1, theta2 ~ 1),
  data = fake_df_llo_mix,
  family = mixture(gaussian, gaussian, order = 'mu'), # order = 'mu' orders the component means to prevent label switching
  prior = c(
    prior(normal(0, 1), class = Intercept, dpar = mu1),
    prior(normal(0, 1), class = Intercept, dpar = mu2)
  ),
  inits = 1, chains = 1, cores = 2, # single chain: faster, but between-chain diagnostics unavailable
  control = list(adapt_delta = 0.999, max_treedepth=12) # conservative sampler settings to avoid divergences
)
## Compiling the C++ model
## Start sampling
## 
## SAMPLING FOR MODEL '9265f6ccc7bb94087f5831738af8939d' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.006736 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 67.36 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 227.657 seconds (Warm-up)
## Chain 1:                184.984 seconds (Sampling)
## Chain 1:                412.641 seconds (Total)
## Chain 1:

Check diagnostics:

  • Trace plots
# trace plots: check for stationarity / no drift (only one chain was run,
# so between-chain mixing cannot be assessed here)
plot(m.fake.llo_mix)

  • Pairs plot
# pairs plot: look for strong posterior correlations between parameters
pairs(m.fake.llo_mix)
## Warning in bayesplot::mcmc_pairs(samples, ...): Only one chain in 'x'. This
## plot is more useful with multiple chains.

  • Summary
# model summary: check Rhat ~ 1.00 and adequate effective sample sizes
print(m.fake.llo_mix)
##  Family: mixture(gaussian, gaussian) 
##   Links: mu1 = identity; sigma1 = identity; mu2 = identity; sigma2 = identity; theta1 = identity; theta2 = identity 
## Formula: lo_cles ~ 1 
##          mu1 ~ lo_ground_truth
##          mu2 ~ 1
##          theta2 ~ 1
##    Data: fake_df_llo_mix (Number of observations: 7800) 
## Samples: 1 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 1000
## 
## Population-Level Effects: 
##                     Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## mu1_Intercept          -0.01      0.03    -0.06     0.04        395 1.00
## mu2_Intercept           5.98      0.01     5.96     6.01       1025 1.00
## theta2_Intercept        1.37      0.03     1.32     1.43        566 1.00
## mu1_lo_ground_truth     1.00      0.01     0.97     1.02        622 1.00
## 
## Family Specific Parameters: 
##        Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma1     0.98      0.02     0.94     1.02        828 1.00
## sigma2     1.00      0.01     0.98     1.02        756 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

So, did we accurately recover the mixture proportions? Let’s find out by plotting the posterior for theta. Because theta is in log odds units we’ll transform it into probability units.

# posteriors of mixture proportion
posterior_samples(m.fake.llo_mix) %>%
  # theta2 is on the log odds scale; plogis() converts to a probability
  # (the true value used to simulate the data was 0.8 for guess_one)
  mutate(p_mix = plogis(b_theta2_Intercept)) %>%
  ggplot(aes(x=p_mix)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior distribution for mixture proportion") +
  theme(panel.grid = element_blank())

Yes, we did it! The ability to estimate the proportion of each alternative within a mixture should help us to dramatically improve our models of real data.

Let’s also check out a posterior predictive distribution for the fake CLES responses.

# posterior predictive check: simulate responses at the observed design points
# (fixed seed for reproducibility) and plot their density in probability units
fake_df_llo_mix %>%
  select(lo_ground_truth,condition,sd_diff,worker_id) %>%
  add_predicted_draws(m.fake.llo_mix, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>%
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       x = NULL) + # was `post_cles = NULL`: labs() has no such aesthetic, so it did nothing; x = NULL blanks the axis title as evidently intended
  theme(panel.grid = element_blank())

And let’s compare this to the data distribution.

# data distribution (not a posterior predictive check): density of the raw
# simulated CLES responses, for visual comparison against the posterior
# predictive distribution above
fake_df_llo_mix %>%
  ggplot(aes(x=cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Distribution of fake CLES responses",
       x = NULL) + # was `cles = NULL`: labs() has no such aesthetic; x = NULL blanks the axis title as evidently intended
  theme(panel.grid = element_blank())

A Mixture of the LLO Model vs A Random Constant Response

Let’s adopt this mixture model for our actual CLES responses by adding a constant response distribution to our most recent iteration of the LLO model.

# mixture model for the real responses:
#   mu1    = LLO submodel (per-worker random intercept + slope; condition x sd_diff slope effects)
#   mu2    = per-worker constant response (random intercept only)
#   theta2 = log odds that a response comes from the constant component
m.cles.llo_mix <- brm(
  bf(lo_cles ~ 1, 
    mu1 ~ (1 + lo_ground_truth|worker_id) + lo_ground_truth:condition:sd_diff, # our most recent llo model
    mu2 ~ (1|worker_id), # random constant response per worker (to account for people who always answer the same, often but not always 50%)
    theta2 ~ (1|worker_id) + condition # the proportion of responses that are constant
  ),
  data = model_df_llo,
  family = mixture(gaussian, gaussian, order = 'mu'), # ordered component means to prevent label switching
  prior = c(
    prior(normal(0, 1), class = Intercept, dpar = mu1),
    prior(normal(0, 1), class = Intercept, dpar = mu2)
  ),
  inits = 1, chains = 2, cores = 2,
  control = list(adapt_delta = 0.999, max_treedepth=15),
  file = "stan/m_cles_llo_mix" # caches the fit; delete this file to refit
)

Check diagnostics:

# trace plots
plot(m.cles.llo_mix)

# pairs plot (too many things to view at once, so we've grouped them)
# mixture proportions
# NOTE: theta2 has a population intercept in this model, so HOPs is the
# reference level and "b_theta2_conditionHOPs" does not exist (see the
# summary below); it was removed from the pars list to avoid an error.
pairs(m.cles.llo_mix, pars = c("b_theta2_Intercept",
                               "sd_worker_id__theta2_Intercept",
                               "b_theta2_conditionmeans_only",
                               "b_theta2_conditionintervals_w_means"))

# pairs plot (too many things to view at once, so we've grouped them)
# hyperparameters
pairs(m.cles.llo_mix, pars = c("b_mu1_Intercept",
                               "sd_worker_id__mu1_Intercept",
                               "sd_worker_id__mu1_lo_ground_truth",
                               "cor_worker_id__mu1_Intercept__mu1_lo_ground_truth",
                               "sigma1",
                               "b_mu2_Intercept",
                               "sd_worker_id__mu2_Intercept",
                               "sigma2"))

# pairs plot (too many things to view at once, so we've grouped them)
# slope effects
pairs(m.cles.llo_mix, pars = c("b_mu1_lo_ground_truth:conditionHOPs:sd_diff1",
                               "b_mu1_lo_ground_truth:conditionintervals_w_means:sd_diff1",
                               "b_mu1_lo_ground_truth:conditionmeans_only:sd_diff1",
                               "b_mu1_lo_ground_truth:conditionHOPs:sd_diff5",
                               "b_mu1_lo_ground_truth:conditionintervals_w_means:sd_diff5",
                               "b_mu1_lo_ground_truth:conditionmeans_only:sd_diff5"))

# model summary
print(m.cles.llo_mix)
##  Family: mixture(gaussian, gaussian) 
##   Links: mu1 = identity; sigma1 = identity; mu2 = identity; sigma2 = identity; theta1 = identity; theta2 = identity 
## Formula: lo_cles ~ 1 
##          mu1 ~ (1 + lo_ground_truth | worker_id) + lo_ground_truth:condition:sd_diff
##          mu2 ~ (1 | worker_id)
##          theta2 ~ (1 | worker_id) + condition
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 2000
## 
## Group-Level Effects: 
## ~worker_id (Number of levels: 39) 
##                                        Estimate Est.Error l-95% CI
## sd(mu1_Intercept)                          1.15      0.19     0.85
## sd(mu1_lo_ground_truth)                    0.36      0.06     0.26
## sd(mu2_Intercept)                          0.01      0.01     0.00
## sd(theta2_Intercept)                       6.85      2.34     3.74
## cor(mu1_Intercept,mu1_lo_ground_truth)    -0.02      0.21    -0.43
##                                        u-95% CI Eff.Sample Rhat
## sd(mu1_Intercept)                          1.59        567 1.01
## sd(mu1_lo_ground_truth)                    0.50        568 1.00
## sd(mu2_Intercept)                          0.03       1135 1.00
## sd(theta2_Intercept)                      13.20        433 1.01
## cor(mu1_Intercept,mu1_lo_ground_truth)     0.38        749 1.00
## 
## Population-Level Effects: 
##                                                         Estimate Est.Error
## mu1_Intercept                                              -0.16      0.13
## mu2_Intercept                                               0.00      0.01
## theta2_Intercept                                           -7.69      3.02
## mu1_lo_ground_truth:conditionHOPs:sd_diff1                  0.52      0.11
## mu1_lo_ground_truth:conditionintervals_w_means:sd_diff1     0.13      0.14
## mu1_lo_ground_truth:conditionmeans_only:sd_diff1            0.04      0.17
## mu1_lo_ground_truth:conditionHOPs:sd_diff5                  0.64      0.10
## mu1_lo_ground_truth:conditionintervals_w_means:sd_diff5     0.39      0.14
## mu1_lo_ground_truth:conditionmeans_only:sd_diff5            0.36      0.16
## theta2_conditionintervals_w_means                           8.83      4.38
## theta2_conditionmeans_only                                 10.94      4.64
##                                                         l-95% CI u-95% CI
## mu1_Intercept                                              -0.49    -0.00
## mu2_Intercept                                              -0.01     0.02
## theta2_Intercept                                          -14.86    -3.20
## mu1_lo_ground_truth:conditionHOPs:sd_diff1                  0.33     0.73
## mu1_lo_ground_truth:conditionintervals_w_means:sd_diff1    -0.15     0.40
## mu1_lo_ground_truth:conditionmeans_only:sd_diff1           -0.31     0.37
## mu1_lo_ground_truth:conditionHOPs:sd_diff5                  0.45     0.84
## mu1_lo_ground_truth:conditionintervals_w_means:sd_diff5     0.12     0.65
## mu1_lo_ground_truth:conditionmeans_only:sd_diff5            0.04     0.67
## theta2_conditionintervals_w_means                           1.99    18.39
## theta2_conditionmeans_only                                  4.41    21.88
##                                                         Eff.Sample Rhat
## mu1_Intercept                                                  259 1.00
## mu2_Intercept                                                 1857 1.00
## theta2_Intercept                                               346 1.01
## mu1_lo_ground_truth:conditionHOPs:sd_diff1                     394 1.01
## mu1_lo_ground_truth:conditionintervals_w_means:sd_diff1        459 1.01
## mu1_lo_ground_truth:conditionmeans_only:sd_diff1               458 1.00
## mu1_lo_ground_truth:conditionHOPs:sd_diff5                     383 1.00
## mu1_lo_ground_truth:conditionintervals_w_means:sd_diff5        452 1.01
## mu1_lo_ground_truth:conditionmeans_only:sd_diff5               412 1.00
## theta2_conditionintervals_w_means                              266 1.01
## theta2_conditionmeans_only                                     262 1.01
## 
## Family Specific Parameters: 
##        Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma1     0.96      0.04     0.90     1.04       1908 1.00
## sigma2     0.12      0.01     0.11     0.13       2611 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

There are a couple of problems with this model: - First, the posterior samples are highly correlated among the estimates of the prevalence of the constant response strategy in each visualization condition. It is clear to me that these three parameters should have a multivariate normal prior (i.e., the model should know about this correlation between conditions), so we’ll try that next. - The second issue I’m seeing in the pairs plots is that the interaction effects for different levels of sd_diff within the same visualization condition are highly correlated. I’m wondering if this means I should remove sd_diff from the model? We’ll try this as well. Based on the performance of the model below, I’m thinking that the submodel representing a constant response pattern is redundant with including sd_diff in the LLO model insofar as both account for the high frequency of responses near 50%, but the mixture seems to be a better match to the data-generating process based on the posterior density of the mixture model.

Revised Mixture of the LLO Model vs A Random Constant Response

This is an adaptation of the previous model which attempts to remedy the issues we saw in the pairs plots.

# define stanvars for multi_normal prior on condition effects
# (injects constants into Stan's data block: a mean vector and a covariance
# matrix for the joint prior on the three theta2 condition effects)
# NOTE(review): the prior mean is 1 on the log odds scale, not 0 -- confirm intentional
stanvars <- stanvar(rep(1, 3), "mu_theta2", scode = "  vector[3] mu_theta2;") +
  stanvar(diag(3), "Sigma_theta2", scode = "  matrix[3, 3] Sigma_theta2;")
# fit the model
# changes from the previous mixture: sd_diff dropped from mu1, and theta2
# uses cell-means coding (0 + condition) with a joint multi_normal prior
m.cles.llo_mix2 <- brm(
   bf(lo_cles ~ 1, 
    mu1 ~ (1 + lo_ground_truth|worker_id) + lo_ground_truth:condition, # our most recent llo model
    mu2 ~ (1|worker_id), # random constant response per worker (to account for people who always answer the same, often but not always 50%)
    theta2 ~ (1|worker_id) + 0 + condition # the proportion of responses that are constant
  ),
  data = model_df_llo,
  family = mixture(gaussian, gaussian, order = 'mu'),
  prior = c(
    prior(normal(0, 1), class = Intercept, dpar = mu1),
    prior(normal(0, 1), class = Intercept, dpar = mu2),
    prior("multi_normal(mu_theta2, Sigma_theta2)", class = b, dpar = theta2)
  ),
  stanvars = stanvars,
  inits = 1, chains = 2, cores = 2,
  control = list(adapt_delta = 0.999, max_treedepth=15),
  file = "stan/m_cles_llo_mix2" # caches the fit; delete this file to refit
)

Check diagnostics:

# trace plots
plot(m.cles.llo_mix2)

# pairs plot (too many things to view at once, so we've grouped them)
# mixture proportions
# (cell-means coding here: all three condition effects exist and there is
#  no population-level theta2 intercept, so none is listed)
pairs(m.cles.llo_mix2, pars = c("sd_worker_id__theta2_Intercept",
                               "b_theta2_conditionHOPs",
                               "b_theta2_conditionmeans_only",
                               "b_theta2_conditionintervals_w_means"))

# pairs plot (too many things to view at once, so we've grouped them)
# hyperparameters
pairs(m.cles.llo_mix2, pars = c("b_mu1_Intercept",
                               "sd_worker_id__mu1_Intercept",
                               "sd_worker_id__mu1_lo_ground_truth",
                               "cor_worker_id__mu1_Intercept__mu1_lo_ground_truth",
                               "sigma1",
                               "b_mu2_Intercept",
                               "sd_worker_id__mu2_Intercept",
                               "sigma2"))

# pairs plot (too many things to view at once, so we've grouped them)
# slope effects (one per condition, since sd_diff was dropped from mu1)
pairs(m.cles.llo_mix2, pars = c("b_mu1_lo_ground_truth:conditionHOPs",
                               "b_mu1_lo_ground_truth:conditionmeans_only",
                               "b_mu1_lo_ground_truth:conditionintervals_w_means"))

# model summary
print(m.cles.llo_mix2)
##  Family: mixture(gaussian, gaussian) 
##   Links: mu1 = identity; sigma1 = identity; mu2 = identity; sigma2 = identity; theta1 = identity; theta2 = identity 
## Formula: lo_cles ~ 1 
##          mu1 ~ (1 + lo_ground_truth | worker_id) + lo_ground_truth:condition
##          mu2 ~ (1 | worker_id)
##          theta2 ~ (1 | worker_id) + 0 + condition
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 2000
## 
## Group-Level Effects: 
## ~worker_id (Number of levels: 39) 
##                                        Estimate Est.Error l-95% CI
## sd(mu1_Intercept)                          1.23      0.20     0.89
## sd(mu1_lo_ground_truth)                    0.35      0.06     0.26
## sd(mu2_Intercept)                          0.01      0.01     0.00
## sd(theta2_Intercept)                       6.16      1.97     3.45
## cor(mu1_Intercept,mu1_lo_ground_truth)    -0.05      0.22    -0.46
##                                        u-95% CI Eff.Sample Rhat
## sd(mu1_Intercept)                          1.70        581 1.00
## sd(mu1_lo_ground_truth)                    0.48        572 1.00
## sd(mu2_Intercept)                          0.03       1125 1.00
## sd(theta2_Intercept)                      11.00        563 1.01
## cor(mu1_Intercept,mu1_lo_ground_truth)     0.36        720 1.00
## 
## Population-Level Effects: 
##                                                Estimate Est.Error l-95% CI
## mu1_Intercept                                     -0.18      0.13    -0.48
## mu2_Intercept                                      0.00      0.01    -0.01
## mu1_lo_ground_truth:conditionHOPs                  0.60      0.10     0.41
## mu1_lo_ground_truth:conditionintervals_w_means     0.27      0.12     0.04
## mu1_lo_ground_truth:conditionmeans_only            0.23      0.15    -0.07
## theta2_conditionHOPs                              -0.79      0.95    -2.59
## theta2_conditionintervals_w_means                  0.83      0.88    -0.92
## theta2_conditionmeans_only                         1.28      0.88    -0.49
##                                                u-95% CI Eff.Sample Rhat
## mu1_Intercept                                     -0.01        301 1.00
## mu2_Intercept                                      0.02       1884 1.00
## mu1_lo_ground_truth:conditionHOPs                  0.79        384 1.00
## mu1_lo_ground_truth:conditionintervals_w_means     0.52        529 1.00
## mu1_lo_ground_truth:conditionmeans_only            0.52        625 1.00
## theta2_conditionHOPs                               1.15       1011 1.00
## theta2_conditionintervals_w_means                  2.59       1146 1.01
## theta2_conditionmeans_only                         2.98        697 1.00
## 
## Family Specific Parameters: 
##        Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma1     1.00      0.04     0.93     1.07       2015 1.00
## sigma2     0.12      0.01     0.11     0.13       2050 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Let’s check out a posterior predictive distribution for CLES.

# posterior predictive check: simulate responses from the fitted mixture at
# the observed design points (fixed seed for reproducibility) and plot their
# density in probability units
model_df_llo %>%
  select(lo_ground_truth,condition,sd_diff,worker_id) %>%
  add_predicted_draws(m.cles.llo_mix2, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>%
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       x = NULL) + # was `post_cles = NULL`: labs() has no such aesthetic, so it did nothing; x = NULL blanks the axis title as evidently intended
  theme(panel.grid = element_blank())

How does this compare to the empirical distribution of CLES responses?

# empirical data distribution (not a posterior predictive check): density of
# the observed CLES responses, for comparison with the posterior predictive
# distribution above
model_df_llo %>%
  ggplot(aes(x=cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Distribution of observed CLES responses", # subtitle fixed: this plots the data, not a posterior predictive distribution
       x = NULL) + # was `cles = NULL`: labs() has no such aesthetic; x = NULL blanks the axis title as evidently intended
  theme(panel.grid = element_blank())

What do the posteriors for the effect of each visualization condition look like?

# use posterior samples to define distributions for the slope in each visualization condition
# (one slope coefficient per condition now that sd_diff is out of the model;
#  the commented-out transmute below is the old sd_diff-interaction version)
posterior_samples(m.cles.llo_mix2) %>%
  transmute(slope_HOPs = `b_mu1_lo_ground_truth:conditionHOPs`,
            slope_intervals_w_means = `b_mu1_lo_ground_truth:conditionintervals_w_means`,
            slope_means_only = `b_mu1_lo_ground_truth:conditionmeans_only`) %>%
  # transmute(slope_HOPs = `b_mu1_lo_ground_truth:conditionHOPs:sd_diff1` + `b_mu1_lo_ground_truth:conditionHOPs:sd_diff5`,
  #           slope_intervals_w_means = `b_mu1_lo_ground_truth:conditionintervals_w_means:sd_diff1` + `b_mu1_lo_ground_truth:conditionintervals_w_means:sd_diff5`,
  #           slope_means_only = `b_mu1_lo_ground_truth:conditionmeans_only:sd_diff1` + `b_mu1_lo_ground_truth:conditionmeans_only:sd_diff5`) %>%
  gather(key, value) %>%
  ggplot(aes(x = value, group = key, color = key, fill = key)) +
  geom_density(alpha = 0.3) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) +
  scale_x_continuous(expression(slope), expand = c(0, 0)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior for slopes by visualization condition") +
  theme(panel.grid = element_blank())

Let’s take a look at some of the estimated linear models per visualization condition.

# this time we'll adopt functions from the tidybayes package to make plotting posterior predictions easier
model_df_llo %>%
  group_by(condition,sd_diff,worker_id) %>%
  # 101-point evaluation grid of log odds ground truth values per group
  data_grid(lo_ground_truth = seq_range(lo_ground_truth, n = 101)) %>%
  add_predicted_draws(m.cles.llo_mix2) %>%
  ggplot(aes(x = lo_ground_truth, y = lo_cles, color = condition, fill = condition)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  stat_lineribbon(aes(y = .prediction), .width = c(.95, .80, .50), alpha = 1/4) + # 50/80/95% prediction bands
  geom_point(data = model_df_llo) + # observed responses
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) + 
  coord_cartesian(xlim = quantile(model_df_llo$lo_ground_truth, c(0, 1)),
                  ylim = quantile(model_df_llo$lo_cles, c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  facet_grid(. ~ condition)

  # facet_grid(sd_diff ~ condition)

What does this look like in probability units?

# same posterior predictive plot as above, transformed to probability units via plogis()
model_df_llo %>%
  group_by(condition,sd_diff,worker_id) %>%
  data_grid(lo_ground_truth = seq_range(lo_ground_truth, n = 101)) %>%
  add_predicted_draws(m.cles.llo_mix2) %>%
  ggplot(aes(x = plogis(lo_ground_truth), y = plogis(lo_cles), color = condition, fill = condition)) +
  geom_abline(intercept = 0, slope = 1, size = 1, alpha = .3, color = "red", linetype = "dashed") + # ground truth
  stat_lineribbon(aes(y = plogis(.prediction)), .width = c(.95, .80, .50), alpha = 1/4) +
  geom_point(data = model_df_llo) +
  scale_fill_brewer(type = "qual", palette = 1) +
  scale_color_brewer(type = "qual", palette = 1) + 
  coord_cartesian(xlim = quantile(plogis(model_df_llo$lo_ground_truth), c(0, 1)),
                  ylim = quantile(plogis(model_df_llo$lo_cles), c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank()) + 
  facet_grid(. ~ condition)

  # facet_grid(sd_diff ~ condition)

What about the mixture proportions? Let’s plot the posterior for theta. Because theta is in log odds units we’ll transform it into probability units.

# posteriors of mixture proportion
# theta2 coefficients are on the log odds scale (cell-means coding, one per
# condition); plogis() converts each to a probability of constant responding
posterior_samples(m.cles.llo_mix2) %>%
  transmute(#p_mix_HOPs = plogis(b_theta2_Intercept),
            p_mix_HOPs = plogis(b_theta2_conditionHOPs),
            p_mix_intervals_w_means = plogis(b_theta2_conditionintervals_w_means),
            p_mix_means_only = plogis(b_theta2_conditionmeans_only)) %>%
  gather(key, value) %>%
  ggplot(aes(x = value, group = key, color = key, fill = key)) +
  geom_density(alpha = 0.3) +
  # scale_fill_brewer(type = "qual", palette = 1) +
  # scale_color_brewer(type = "qual", palette = 1) +
  scale_x_continuous("proportion of constant responses", expand = c(0, 0)) + # axis label fixed: this plot shows mixture proportions, not slopes
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior for proportion of constant response by visualization condition") +
  theme(panel.grid = element_blank())

All of this looks good! Let’s double check that this is better than the other mixture according to WAIC.

waic(m.vis.wrkr.llo_cles, m.cles.llo_mix2)
##                                          WAIC    SE
## m.vis.wrkr.llo_cles                   2156.51 84.86
## m.cles.llo_mix2                       1156.55 86.65
## m.vis.wrkr.llo_cles - m.cles.llo_mix2  999.96 72.65

This is confirmation that the most recent LLO vs constant response mixture is a dramatic improvement.